import sys
sys.path.append("..")
from database.db_handler import MysqlHander
from common.my_http import MyHttp
import urllib.request
from bs4 import BeautifulSoup
from common.entity_data import EntityData
import glob
import jieba
import re
import threading
import hashlib

'''
Crawler for investment institutions listed on zdb.pedaily.cn (投资界), grouped by industry.
'''

class TouzijigouData:
    """Collect investment-institution names per industry from zdb.pedaily.cn.

    For every industry in ``url_map`` the paginated company listing is walked,
    the institution names are gathered into one ``|``-prefixed string, and the
    result is written through ``EntityData``.
    """

    def __init__(self):
        self.entity = EntityData()
        # Industry label -> base listing URL; page N lives at "<base>-p<N>/".
        self.url_map = {"互联网":"https://zdb.pedaily.cn/company/h5537",
                        "移动互联网":"https://zdb.pedaily.cn/company/h4823",
                        "IT":"https://zdb.pedaily.cn/company/h690",
                        "房地产":"https://zdb.pedaily.cn/company/h4973",
                        "汽车":"https://zdb.pedaily.cn/company/h5044",
                        "连锁零售":"https://zdb.pedaily.cn/company/h119",
                        "能源及矿产":"https://zdb.pedaily.cn/company/h1154",
                        "广播电视":"https://zdb.pedaily.cn/company/h1381",
                        "娱乐传媒":"https://zdb.pedaily.cn/company/h2164",
                        "物流":"https://zdb.pedaily.cn/company/h2359",
                        "教育与培训":"https://zdb.pedaily.cn/company/h2869",
                        "清洁技术":"https://zdb.pedaily.cn/company/h2947",
                        "农林牧渔":"https://zdb.pedaily.cn/company/h3052",
                        "金融":"https://zdb.pedaily.cn/company/h3362",
                        "食品饮料":"https://zdb.pedaily.cn/company/h3456",
                        "半导体":"https://zdb.pedaily.cn/company/h3601",
                        "医疗健康":"https://zdb.pedaily.cn/company/h3622",
                        "机械制造":"https://zdb.pedaily.cn/company/h4023",
                        "化工原料":"https://zdb.pedaily.cn/company/h4238",
                        "建筑工程":"https://zdb.pedaily.cn/company/h4971",
                        "纺织服装":"https://zdb.pedaily.cn/company/h5643",
                        "光电设备":"https://zdb.pedaily.cn/company/h5851"}

    @staticmethod
    def _page_titles(page):
        """Return the raw ``<h3>`` texts inside the page's company list.

        ``page`` is the parsed document for one listing page. An empty list is
        returned both when the ``<ul id="company-list">`` element is absent
        and when it contains no ``<h3>`` entries, so callers can treat either
        case as the end of pagination instead of crashing on ``None``.
        """
        company_list = page.find(name="ul", attrs={'id':'company-list'})
        if company_list is None:
            return []
        return [heading.text for heading in company_list.find_all("h3")]

    @staticmethod
    def _format_names(titles):
        """Turn raw title texts into a ``|``-prefixed name string.

        The "不公开的投资者" (undisclosed investor) placeholder is skipped, and
        anything from the first full-width "（" onward is cut off each title.
        """
        return "".join(
            "|" + title.split("（")[0]
            for title in titles
            if title != "不公开的投资者"
        )

    def get_data(self, max_pages=499):
        """Crawl every industry listing and store the collected names.

        Args:
            max_pages: Upper bound on listing pages fetched per industry.
                Crawling for an industry stops earlier at the first page that
                yields no titles.
        """
        for industry, base_url in self.url_map.items():
            name = industry + "行业投资机构"
            chunks = []
            for page_no in range(1, max_pages + 1):
                url = base_url + "-p" + str(page_no) + "/"
                titles = self._page_titles(MyHttp.bs4_utf8_data(url))
                if not titles:
                    # No list or no entries: past the last page.
                    break
                chunks.append(self._format_names(titles))
            name_set = "".join(chunks)
            print(name + name_set)
            # Same arguments go to add and then update.
            # NOTE(review): presumably insert-if-missing followed by refresh;
            # the meaning of 100 and 1000003 is defined by EntityData — confirm.
            self.entity.add_entity_data(name, "来源：投资界官网", 100, 1000003, "", "投资", name_set)
            self.entity.update_entity_data(name, "来源：投资界官网", 100, 1000003, "", "投资", name_set)

def touzijigou_api():
    """Entry point: build a TouzijigouData crawler and run its full crawl."""
    TouzijigouData().get_data()

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    touzijigou_api()
